# Setup ----
# Install the SDAM package from GitHub only when it is missing; the original
# ran install_github() unconditionally on every execution, which is slow and
# requires network access.
if (!requireNamespace("sdam", quietly = TRUE)) {
  devtools::install_github("sdam-au/sdam")
  # Fallback source if the main repository does not work:
  # devtools::install_github("mplex/cedhar", subdir = "pkg/sdam")
}
library(tidyverse)
library(sdam)
library(jsonlite)
library(leaflet)
library(tidytext)

# Load the cleaned EDH dataset from sciencedata.dk.
# NOTE(review): `request()` comes from the sdam package and `mycred_secret`
# must already hold valid sciencedata.dk credentials — this call fails
# without them; confirm they are defined before running.
resp <- request("EDH_text_cleaned_2020-10-09.json",
                path = "/sharingin/648597@au.dk/SDAM_root/SDAM_data/EDH/public",
                method = "GET", cred = mycred_secret)
list_json <- jsonlite::fromJSON(resp)
EDH_tibble <- as_tibble(list_json)
head(EDH_tibble)
# Tokenize the interpretive clean text into one word per row.
# NOTE(review): the original first line was garbled
# ("clean_text_interpretive_word columnEDH_tokenized <- ...") and would not
# parse; the intended statement — assigning the tokenized tibble to
# EDH_tokenized — is reconstructed here.
EDH_tokenized <- EDH_tibble %>%
  unnest_tokens(word, clean_text_interpretive_word,
                token = stringr::str_split, pattern = " ") %>%
  drop_na(word) %>%
  print()
## # A tibble: 1,136,004 x 74
## responsible_ind… type_of_inscrip… letter_size not_after literature
## <chr> <chr> <chr> <chr> <chr>
## 1 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 2 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 3 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 4 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 5 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 6 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 7 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 8 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 9 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## 10 Feraudi epitaph 3.2-2 cm 0130 AE 1983, …
## # … with 1,135,994 more rows, and 69 more variables: work_status <chr>,
## # height <chr>, diplomatic_text <chr>, people <list>, depth <chr>,
## # material <chr>, type_of_monument <chr>, province_label <chr>, width <chr>,
## # transcription <chr>, country <chr>, uri <chr>, findspot_ancient <chr>,
## # last_update <chr>, modern_region <chr>, findspot_modern <chr>,
## # language <chr>, id <chr>, edh_geography_uri <chr>, commentary <chr>,
## # trismegistos_uri <chr>, not_before <chr>, external_image_uris <list>,
## # fotos <list>, coordinates <list>, idno_tm <chr>, placenames_refs <list>,
## # text_edition <chr>, origdate_text <chr>, layout_execution <chr>,
## # layout_execution_text <chr>, support_objecttype <chr>,
## # support_objecttype_text <chr>, support_material <chr>,
## # support_material_text <chr>, support_decoration <chr>, keywords_term <chr>,
## # keywords_term_text <chr>, type_of_inscription_clean <chr>,
## # type_of_inscription_certainty <chr>, height_cm <dbl>, width_cm <dbl>,
## # depth_cm <dbl>, material_clean <chr>, type_of_monument_clean <chr>,
## # type_of_monument_certainty <chr>, province_label_clean <chr>,
## # province_label_certainty <chr>, country_clean <chr>,
## # country_certainty <chr>, findspot_ancient_clean <chr>,
## # findspot_ancient_certainty <chr>, modern_region_clean <chr>,
## # modern_region_certainty <chr>, findspot_modern_clean <chr>,
## # findspot_modern_certainty <chr>, findspot_clean <chr>,
## # findspot_certainty <chr>, origdate_text_clean <chr>,
## # clean_text_conservative <chr>, clean_text_interpretive_sentence <chr>,
## # findspot <chr>, year_of_find <chr>, present_location <chr>, religion <chr>,
## # geography <chr>, social_economic_legal_history <chr>, military <chr>,
## # word <chr>
# The most frequent words overall: keep those occurring more than 5,000
# times and order the factor levels by frequency for plotting.
EDH_tokenized %>%
  count(word, sort = TRUE) %>%
  filter(n > 5000) %>%
  mutate(word = reorder(word, n)) %>%
  print()
## # A tibble: 22 x 2
## word n
## <fct> <int>
## 1 et 36364
## 2 dis 12165
## 3 manibus 12024
## 4 vixit 9544
## 5 in 8781
## 6 annos 7947
## 7 filius 7536
## 8 annorum 7051
## 9 est 6880
## 10 i 6827
## # … with 12 more rows
# Total number of tokenized words per Roman province.
# NOTE(review): rewritten with left-assignment (style guide: avoid `->`)
# and an explicit `.groups = "drop"` to silence the summarise() message.
words_total_province <- EDH_tokenized %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  summarise(total = sum(n), .groups = "drop") %>%
  mutate(province_label_clean = reorder(province_label_clean, total))
head(words_total_province)
# Bar chart of word totals per province.
# NOTE(review): the original called theme_classic() and then
# theme_linedraw(); a complete theme replaces the previous one entirely,
# so theme_classic() had no effect and is dropped here.
words_total_province %>%
  ggplot(aes(total, province_label_clean)) +
  geom_col(fill = "darkblue", width = 0.7) +
  labs(x = "Number of words", y = "Province name",
       title = "Number of total words on inscriptions per Roman province",
       subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10)
# Stacked bar chart: words occurring more than 1,000 times, per province,
# segments filled by word. (The n > 1000 filter compares a plain column,
# so it is applied before grouping — same rows either way.)
EDH_tokenized %>%
  count(province_label_clean, word, sort = TRUE) %>%
  filter(n > 1000) %>%
  group_by(province_label_clean) %>%
  mutate(province_label_clean = reorder(province_label_clean, n)) %>%
  ggplot(aes(n, province_label_clean)) +
  geom_col(aes(fill = word), width = 0.7) +
  labs(
    x = "Number of words",
    y = "Province name",
    title = "The most common words on inscriptions per Roman province",
    subtitle = "EDH dataset, n = 81,476 inscriptions"
  ) +
  theme_linedraw(base_size = 10)
# Words appearing more than 1,000 times on epitaphs.
# NOTE(review): the original grouped by type_of_inscription_clean and then
# filtered down to a single type, so the grouping was redundant and is
# removed; it also called theme_minimal() immediately before
# theme_linedraw() — a complete theme replaces the previous one, so
# theme_minimal() had no effect and is dropped. Right-assignment (`->`)
# replaced with left-assignment per the style guide.
words_epitaph <- EDH_tokenized %>%
  count(type_of_inscription_clean, word, sort = TRUE) %>%
  filter(type_of_inscription_clean == "epitaph") %>%
  filter(n > 1000) %>%
  mutate(word = reorder(word, n))
total_words_epitaph <- sum(words_epitaph$n)
words_epitaph %>%
  ggplot(aes(y = word, x = n, color = n)) +
  geom_col(width = 0.7) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_linedraw(base_size = 9) +
  labs(x = "Number of words", y = "Word",
       title = "The most common words on epitaphs",
       subtitle = "n = 123,039 words")
# Words appearing more than 100 times on milestones.
# NOTE(review): the original grouped by type_of_inscription_clean and then
# filtered down to a single type, making the grouping redundant (removed);
# theme_minimal() was immediately replaced by the complete theme
# theme_linedraw(), so it had no effect and is dropped.
words_milestone <- EDH_tokenized %>%
  count(type_of_inscription_clean, word, sort = TRUE) %>%
  filter(type_of_inscription_clean == "mile-/leaguestone") %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n))
words_milestone
total_words_milestone <- sum(words_milestone$n)
words_milestone %>%
  ggplot(aes(y = word, x = n, color = n)) +
  geom_col(width = 0.6) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_linedraw(base_size = 9) +
  labs(x = "Number of words", y = "Word",
       title = "The most common words on milestones",
       subtitle = "n = 24,986 words")
ggsave(filename = "../figures/EDH_milestone_common_words.png", width = 8, height = 8)
# Most common milestone words (more than 50 occurrences) per province,
# bar segments filled by word.
EDH_tokenized %>%
  filter(type_of_inscription_clean == "mile-/leaguestone") %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  filter(n > 50) %>%
  mutate(province_label_clean = reorder(province_label_clean, n)) %>%
  ggplot(aes(n, province_label_clean)) +
  geom_col(aes(fill = word), width = 0.7) +
  labs(
    x = "Number of words",
    y = "Province name",
    title = "The most common words on milestones per Roman province",
    subtitle = "EDH dataset, n = 81,476 inscriptions"
  ) +
  theme_linedraw(base_size = 10)
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of milestone words occurring more than 50 times. The province
# reordering does not affect the cloud but mirrors the counting pipeline
# used for the province plots.
EDH_tokenized %>%
  filter(type_of_inscription_clean == "mile-/leaguestone") %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  filter(n > 50) %>%
  mutate(province_label_clean = reorder(province_label_clean, n)) %>%
  {
    wordcloud(.$word, .$n, max.words = 200)
  }
# Frequency of words. Source: https://www.tidytextmining.com/tfidf.html
# Term frequency and inverse document frequency let us find words that are
# characteristic of one document (here: inscription type) within a
# collection of documents.
insc_types_words <- EDH_tokenized %>%
  count(type_of_inscription_clean, word, sort = TRUE)
total_words <- insc_types_words %>%
  group_by(type_of_inscription_clean) %>%
  # .groups = "drop" silences the summarise() regrouping message.
  summarize(total = sum(n), .groups = "drop")
# NOTE(review): the join key is spelled out explicitly instead of relying
# on the natural-join message ("Joining, by = ...").
insc_types_words <- left_join(insc_types_words, total_words,
                              by = "type_of_inscription_clean")
insc_types_words
# Distribution of term frequency (n / total) per inscription type.
ggplot(insc_types_words, aes(n/total, fill = type_of_inscription_clean)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~type_of_inscription_clean, ncol = 4, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4509 rows containing non-finite values (stat_bin).
## Warning: Removed 18 rows containing missing values (geom_bar).
ggsave(filename = "../figures/EDH_freq_words_insc_type.png", width = 8, height = 8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4509 rows containing non-finite values (stat_bin).
## Warning: Removed 18 rows containing missing values (geom_bar).
# Zipf's law: rank each word within its inscription type (rows are already
# sorted by descending count) and compute its term frequency.
freq_by_rank <- insc_types_words %>%
  group_by(type_of_inscription_clean) %>%
  mutate(
    rank = row_number(),
    `term frequency` = n / total
  )
freq_by_rank
# Rank vs. term frequency on log-log axes, one line per inscription type.
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = type_of_inscription_clean)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
  scale_x_log10() +
  scale_y_log10()
# Fit a power law to the middle section of the rank range, away from the
# extremes at both ends.
rank_subset <- freq_by_rank %>%
  filter(rank < 500, rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.9723 -0.9194
# Overlay a fitted reference line on the log-log plot.
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = type_of_inscription_clean)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
  geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
  scale_x_log10() +
  scale_y_log10()
# Commentary: the EDH corpus uses a lower percentage of the most common words
# than many other collections of language.
# Compute tf-idf for each word within its inscription type.
insc_types_words <- insc_types_words %>%
  bind_tf_idf(word, type_of_inscription_clean, n)
insc_types_words
insc_types_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
# Top-15 tf-idf words per inscription type.
# NOTE(review): the original called labs() twice — the second call set both
# x and y again, so the first labs(x = NULL, y = "tf-idf") was fully
# shadowed and is dropped. The superseded top_n() is replaced by the
# explicit slice_max(tf_idf, n = 15), which keeps the same
# ties-included behaviour without the "Selecting by tf_idf" message.
insc_types_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(type_of_inscription_clean) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = type_of_inscription_clean)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~type_of_inscription_clean, ncol = 4, scales = "free_y") +
  coord_flip() +
  labs(x = "word", y = "tf-idf",
       title = "Term frequency - inverse document frequency (tf-idf) by type of inscription",
       subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10)
ggsave("../figures/EDH_tf_idf_insc_type.png", width = 16, height = 16)
# Candidate stop words (frequent function words and numerals).
mystopwords <- tibble(word = c("et", "in", "qui", "i", "v", "ii", "ex"))
# Build bigrams (pairs of consecutive words) from the interpretive clean text.
insc_bigrams <- EDH_tibble %>%
  select(clean_text_interpretive_word, type_of_inscription_clean,
         province_label_clean) %>%
  unnest_tokens(bigram, clean_text_interpretive_word,
                token = "ngrams", n = 2)
head(insc_bigrams)
# Most frequent bigrams overall.
insc_bigrams %>%
  count(bigram, sort = TRUE)
# Split each bigram into its two component words.
bigrams_separated <- insc_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ")
bigrams_separated %>%
  count(word1, word2, sort = TRUE)
# What other words occur together with the word "passuum"?
# Words that precede "passuum", broken down by inscription type.
bigrams_separated %>%
  filter(word2 == "passuum") %>%
  count(type_of_inscription_clean, word1, sort = TRUE)
# Words that follow "passuum", broken down by inscription type.
bigrams_separated %>%
  filter(word1 == "passuum") %>%
  count(type_of_inscription_clean, word2, sort = TRUE)
# tf-idf of whole bigrams within each inscription type, highest first.
bigram_tf_idf <- insc_bigrams %>%
  count(type_of_inscription_clean, bigram) %>%
  bind_tf_idf(bigram, type_of_inscription_clean, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
# Top-10 tf-idf bigrams per inscription type.
# NOTE(review): the superseded top_n(10) is replaced by the explicit
# slice_max(tf_idf, n = 10); both keep ties, so the plotted rows are the
# same, and the "Selecting by tf_idf" message disappears.
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(type_of_inscription_clean) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = type_of_inscription_clean)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~type_of_inscription_clean, ncol = 3, scales = "free_y") +
  coord_flip() +
  theme_linedraw(base_size = 10)
ggsave("../figures/EDH_bigrams_tf_idf_insc_type.png", width = 20, height = 20)
library(igraph)
## Attaching igraph masks several functions:
## from dplyr:  as_data_frame, groups, union
## from purrr:  compose, simplify
## from tidyr:  crossing
## from tibble: as_data_frame
## from stats:  decompose, spectrum
## from base:   union
# Directed graph of bigrams that occur more than 500 times; edges run from
# word1 to word2 and carry the bigram count.
bigram_graph <- bigrams_separated %>%
  count(word1, word2, sort = TRUE) %>%
  filter(n > 500) %>%
  graph_from_data_frame()
bigram_graph
## IGRAPH 9fea7f5 DN-- 105 87 --
## + attr: name (v/c), n (e/n)
## + edges from 9fea7f5 (vertex names):
## [1] dis ->manibus vixit ->annos votum ->solvit
## [4] solvit ->libens tribunicia->potestate manibus ->sacrum
## [7] libens ->merito hic ->situs situs ->est
## [10] bene ->merenti iovi ->optimo optimo ->maximo
## [13] sibi ->et vixit ->annis imperatori->caesari
## [16] imperator ->caesar hic ->sita pro ->salute
## [19] terra ->levis sita ->est tibi ->terra
## [22] et ->i sit ->tibi pro ->praetore
## + ... omitted several edges
library(ggraph)
# Basic network plot of the bigram graph (seed fixed so the "fr" layout is
# reproducible).
set.seed(1000)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
ggsave("../figures/EDH_bigrams_networks.png", width = 10, height = 10)
# Polished version: edge transparency scaled by bigram count, arrows showing
# bigram direction.
# NOTE(review): unit() is namespaced as grid::unit() to match the explicit
# grid::arrow() call, so the line no longer depends on ggplot2's re-export
# of unit().
set.seed(1000)
a <- grid::arrow(type = "closed", length = grid::unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 4) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
ggsave("../figures/EDH_bigrams_networks_2.png", width = 10, height = 10)
# Trigram counts from the interpretive clean text: tokenize into triples of
# consecutive words, split them into columns, and count each combination.
insc_trigram <- EDH_tibble %>%
  select(clean_text_interpretive_word, type_of_inscription_clean,
         province_label_clean) %>%
  unnest_tokens(trigram, clean_text_interpretive_word,
                token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  count(word1, word2, word3, sort = TRUE)
insc_trigram
library(widyr)
# Count how often pairs of words co-occur within the same inscription
# (grouped by the id column).
word_pairs <- EDH_tokenized %>%
  pairwise_count(word, id, sort = TRUE)
## NOTE: widyr currently emits deprecation warnings about distinct_() and
## tbl_df(); these come from the package internals, not from this code.
word_pairs
# Pairs involving the word "votum".
word_pairs %>%
  filter(item1 == "votum")
# Correlation among words indicates how often they appear together relative
# to how often they appear separately.
# Pairwise correlation (phi coefficient) between words, measured across
# inscriptions (id). Restricted to words occurring at least 100 times —
# the n() filter must follow group_by(word), since it counts per group.
word_cors <- EDH_tokenized %>%
  group_by(word) %>%
  filter(n() >= 100) %>%
  pairwise_cor(word, id, sort = TRUE)
word_cors
# Words most correlated with "votum".
word_cors %>%
  filter(item1 == "votum")
# how many words has milestone
# NOTE(review): `EDH_dfm` is not defined anywhere in this script chunk —
# presumably a document-feature matrix with inscription types as document
# rows, built in code outside this view; confirm it exists before running
# these lines.
sum(EDH_dfm["mile-/leaguestone",])
## [1] 39424
max(EDH_dfm["mile-/leaguestone",])
## [1] 1102
# Keep the milestone row (word counts for the milestone category) for later use.
milestone <- EDH_dfm["mile-/leaguestone",]